home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
The Atari Compendium
/
The Atari Compendium (Toad Computers) (1994).iso
/
files
/
compress
/
arc.zoo
/
arclzw.c
< prev
next >
Wrap
C/C++ Source or Header
|
1989-01-29
|
22KB
|
805 lines
/*
* $Header: arclzw.c,v 1.6 88/07/31 18:49:49 hyc Exp $
*/
/*
* ARC - Archive utility - ARCLZW
*
* Version 2.03, created on 10/24/86 at 11:46:22
*
* (C) COPYRIGHT 1985,86 by System Enhancement Associates; ALL RIGHTS RESERVED
*
* By: Thom Henderson
*
* Description: This file contains the routines used to implement Lempel-Zev
* data compression, which calls for building a coding table on the fly.
* This form of compression is especially good for encoding files which
* contain repeated strings, and can often give dramatic improvements over
* traditional Huffman SQueezing.
*
* Language: Computer Innovations Optimizing C86
*
* Programming notes: In this section I am drawing heavily on the COMPRESS
* program from UNIX. The basic method is taken from "A Technique for High
* Performance Data Compression", Terry A. Welch, IEEE Computer Vol 17, No 6
* (June 1984), pp 8-19. Also see "Knuth's Fundamental Algorithms", Donald
* Knuth, Vol 3, Section 6.4.
*
* As best as I can tell, this method works by tracing down a hash table of code
* strings where each entry has the property:
*
* if <string> <char> is in the table then <string> is in the table.
*/
#include <stdio.h>
#include "arc.h"
void putc_pak(), abort(), putc_ncr();
int getc_unp();
#if MSDOS
char *setmem();
#else
#ifndef __STDC__
char *memset();
#endif
#endif
static void putcode();
/* definitions for older style crunching */
#define FALSE 0
#define TRUE !FALSE
#define TABSIZE 4096
#define NO_PRED 0xFFFF
#define EMPTY 0xFFFF
#define NOT_FND 0xFFFF
static unsigned short inbuf; /* partial input code storage */
static int sp; /* current stack pointer */
struct entry { /* string table entry format */
char used; /* true when this entry is in use */
unsigned char follower; /* char following string */
unsigned short next; /* ptr to next in collision list */
unsigned short predecessor; /* code for preceeding string */
}; /* string_tab[TABSIZE]; the code string table */
/* definitions for the new dynamic Lempel-Zev crunching */
#define BITS 12 /* maximum bits per code */
#define HSIZE 5003 /* 80% occupancy */
#define INIT_BITS 9 /* initial number of bits/code */
static int n_bits; /* number of bits/code */
static int maxcode; /* maximum code, given n_bits */
#define MAXCODE(n) ((1<<(n)) - 1) /* maximum code calculation */
static int maxcodemax = 1 << BITS; /* largest possible code (+1) */
static char buf[BITS]; /* input/output buffer */
static unsigned char lmask[9] = /* left side masks */
{
0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00
};
static unsigned char rmask[9] = /* right side masks */
{
0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff
};
static int offset; /* byte offset for code output */
static long in_count; /* length of input */
static long bytes_out; /* length of compressed output */
static long bytes_ref; /* output quality reference */
static long bytes_last; /* output size at last checkpoint */
static unsigned short ent;
/*
* To save much memory (which we badly need at this point), we overlay the
* table used by the previous version of Lempel-Zev with those used by the
* new version. Since no two of these routines will be used together, we can
* safely do this.
*/
extern long htab[HSIZE]; /* hash code table (crunch) */
extern unsigned short codetab[HSIZE]; /* string code table (crunch) */
static struct entry *string_tab=(struct entry *)htab; /* old crunch string table */
static unsigned short *prefix=codetab; /* prefix code table (uncrunch) */
static unsigned char *suffix=(unsigned char *)htab; /* suffix table (uncrunch) */
static int free_ent; /* first unused entry */
static int firstcmp; /* true at start of compression */
extern unsigned char stack[HSIZE]; /* local push/pop stack */
/*
* block compression parameters -- after all codes are used up, and
* compression rate changes, start over.
*/
static int clear_flg;
#define CHECK_GAP 2048 /* ratio check interval */
static long checkpoint;
void upd_tab();
/*
* the next two codes should not be changed lightly, as they must not lie
* within the contiguous general code space.
*/
#define FIRST 257 /* first free entry */
#define CLEAR 256 /* table clear output code */
/*
* The cl_block() routine is called at each checkpoint to determine if
* compression would likely improve by resetting the code table. The method
* chosen to determine this is based on empirical observation that, in
* general, every 2k of input data should compress at least as well as the
* first 2k of input.
*/
static void
cl_block(t) /* table clear for block compress */
FILE *t; /* our output file */
{
checkpoint = in_count + CHECK_GAP;
if (bytes_ref) {
if (bytes_out - bytes_last > bytes_ref) {
setmem(htab, HSIZE * sizeof(long), 0xff);
free_ent = FIRST;
clear_flg = 1;
putcode(CLEAR, t);
bytes_ref = 0;
}
} else
bytes_ref = bytes_out - bytes_last;
bytes_last = bytes_out; /* remember where we were */
}
/*****************************************************************
*
* Output a given code.
* Inputs:
* code: A n_bits-bit integer. If == -1, then EOF. This assumes
* that n_bits =< (LONG)wordsize - 1.
* Outputs:
* Outputs code to the file.
* Assumptions:
* Chars are 8 bits long.
* Algorithm:
* Maintain a BITS character long buffer (so that 8 codes will
* fit in it exactly). When the buffer fills up empty it and start over.
*/
static void
putcode(code, t) /* output a code */
int code; /* code to output */
FILE *t; /* where to put it */
{
int r_off = offset; /* right offset */
int bits = n_bits; /* bits to go */
char *bp = buf; /* buffer pointer */
int n; /* index */
register int ztmp;
if (code >= 0) { /* if a real code *//* Get to the first byte. */
bp += (r_off >> 3);
r_off &= 7;
/*
* Since code is always >= 8 bits, only need to mask the
* first hunk on the left.
*/
ztmp = (code << r_off) & lmask[r_off];
*bp = (*bp & rmask[r_off]) | ztmp;
bp++;
bits -= (8 - r_off);
code >>= (8 - r_off);
/* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
if (bits >= 8) {
*bp++ = code;
code >>= 8;
bits -= 8;
}
/* Last bits. */
if (bits)
*bp = code;
offset += n_bits;
if (offset == (n_bits << 3)) {
bp = buf;
bits = n_bits;
bytes_out += bits;
do
putc_pak(*bp++, t);
while (--bits);
offset = 0;
}
/*
* If the next entry is going to be too big for the code
* size, then increase it, if possible.
*/
if (free_ent > maxcode || clear_flg > 0) {
/*
* Write the whole buffer, because the input side
* won't discover the size increase until after
* it has read it.
*/
if (offset > 0) {
bp = buf; /* reset pointer for writing */
bytes_out += n = n_bits;
while (n--)
putc_pak(*bp++, t);
}
offset = 0;
if (clear_flg) { /* reset if clearing */
maxcode = MAXCODE(n_bits = INIT_BITS);
clear_flg = 0;
} else {/* else use more bits */
n_bits++;
if (n_bits == BITS)
maxcode = maxcodemax;
else
maxcode = MAXCODE(n_bits);
}
}
} else { /* dump the buffer on EOF */
bytes_out += n = (offset + 7) / 8;
if (offset > 0)
while (n--)
putc_pak(*bp++, t);
offset = 0;
}
}
/*****************************************************************
*
* Read one code from the standard input. If EOF, return -1.
* Inputs:
* cmpin
* Outputs:
* code or -1 is returned.
*/
static int
getcode(f) /* get a code */
FILE *f; /* file to get from */
{
int code;
static int offset = 0, size = 0;
int r_off, bits;
unsigned char *bp = (unsigned char *) buf;
if (clear_flg > 0 || offset >= size || free_ent > maxcode) {
/*
* If the next entry will be too big for the current code
* size, then we must increase the size. This implies
* reading a new buffer f